import pandas as pd
import os
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as py
from scipy.optimize import curve_fit
from datetime import datetime
import re
# Let pandas print up to 250 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 250)
# Use plotly's 'notebook' renderer as the default for every figure shown below.
pio.renderers.default = 'notebook'
from datetime import datetime
# Print the time of this run, formatted as day-month-year hour:minute.
now = datetime.now()
current_time = now.strftime("%d-%m-%Y %H:%M")
print("Updated on", current_time, "h")
################################ Loading xls
# Download the workbook published by ECDC and load it into a DataFrame.
url = "https://www.ecdc.europa.eu/sites/default/files/documents/COVID-19-geographic-disbtribution-worldwide.xlsx"
cases = pd.read_excel(url)
#filepath = "C:\\Users\\edidd\\Documents\\Ubiqum\\Data Analytics Course\\covid19\\data\\"
#cases = pd.read_excel(os.path.join(filepath, "COVID-19-geographic-disbtribution-worldwide-2020-04-23.xlsx"))
# filepath2 = "C:\\Users\\edidd\\Documents\\covid19_JHU_CSSE\\COVID-19\\"
# data_jhu= pd.read_csv(os.path.join(filepath2, "who_covid_19_situation_reports\\who_covid_19_sit_rep_time_series\\who_covid_19_sit_rep_time_series.csv"))
############################# Data wrangling
# Parse the report date, switch to shorter column names and order the rows
# by country and then by date.
cases["dateRep"] = pd.to_datetime(cases["dateRep"], format="%Y-%m-%d")
cases = cases.rename(
    columns={
        "dateRep": "date",
        "countriesAndTerritories": "country",
        "cases": "newcases",
        "deaths": "newdeaths",
    }
).sort_values(by=["country", "date"])
# Running total of new cases within each country (rows are already in date order).
cases["cumcases"] = cases.groupby("country")["newcases"].cumsum()
######################### Deleting rows with 0 cumulative cases
# After this the frame is indexed by country, so cases.loc[<country>] selects
# all rows of one country.
cases = cases[cases["cumcases"] != 0].set_index("country")
# Grouping by country
# Only the two daily-count columns are summed. A blanket groupby().sum() also
# tries to add up the date and text columns, which raises TypeError on
# pandas >= 2.0 (numeric_only now defaults to False); selecting the columns
# up front gives the same two totals on every pandas version.
cases_total = (
    cases.groupby("country")[["newcases", "newdeaths"]]
    .sum()
    .sort_values(by=["newcases"], ascending=False)
    .rename(columns={"newcases": "total_cases", "newdeaths": "total_deaths"})
)
# Deaths as a percentage of reported cases.
cases_total["death_ratio"] = cases_total.total_deaths / cases_total.total_cases * 100
# Largest popData2019 value seen for each country (aligned on the country index).
cases_total["population_2019"] = cases.groupby("country")["popData2019"].max()
# Reported cases as a percentage of the population.
cases_total["cases_to_population"] = cases_total.total_cases / cases_total.population_2019 * 100
# From here on both ratios are display strings such as "1.23%", not numbers.
cases_total["cases_to_population"] = cases_total["cases_to_population"].map('{:,.2f}%'.format)
cases_total["death_ratio"] = cases_total["death_ratio"].map('{:,.2f}%'.format)
# cases_total is sorted by total cases in descending order, so its first ten
# rows are the ten most affected countries. x, y and text are all taken from
# that same slice so the three arrays have matching lengths.
top10 = cases_total.head(10)
fig = go.Figure(
    data=[
        go.Bar(
            x=top10.index,
            y=top10.total_cases,
            # Bar label: the share of the population, prefixed with the
            # asterisk that the x-axis title explains.
            text="*" + top10.cases_to_population,
            textposition='auto',
        )
    ]
)
fig.update_layout(
    title="Total confirmed cases per country - top 10",
    xaxis_title="* Showing percentage of country's population",
)
fig.show()
# Countries with at least 1000 reported cases. The death ratio is recomputed
# here as a plain fraction because cases_total stores it as a formatted string.
morethan1000 = cases_total[cases_total["total_cases"] >= 1000].copy()
morethan1000["death_ratio"] = morethan1000["total_deaths"] / morethan1000["total_cases"]
# Histogram of the fraction in bins of width 0.01, normalised to probabilities.
fig = go.Figure(
    data=[
        go.Histogram(
            x=morethan1000["death_ratio"],
            histnorm="probability",
            xbins={
                "start": 0,
                "end": morethan1000["death_ratio"].max(),
                "size": .01,
            },
        )
    ]
)
fig.update_layout(
    title="Global distribution of death ratio",
    xaxis={
        "tickformat": "%",
        "tickangle": 0,
        "title": '* Only countries with more than 999 reported cases are considered',
    },
)
pio.show(fig)
# Complete list
# Trends are shown for countries with more than 999 reported cases. Double-click a country name in the legend next to a figure to show only that country.
cases_total = cases_total.sort_values("country")
# Countries shown in the trend figures: those with at least 1000 total cases.
# (A hand-written list used to be assigned here and was overwritten right away.)
country_list = cases_total.loc[cases_total.total_cases >= 1000, :].index
fig = go.Figure()
for country in country_list:
    # One lookup per country; cases is indexed by country name.
    country_cases = cases.loc[country]
    fig.add_trace(
        go.Scatter(
            x=country_cases.date,
            y=country_cases.cumcases,
            mode="markers",
            name=country,
        )
    )
fig.update_layout(title="Cummulative cases per country")#, yaxis_type="log")
pio.show(fig)
# cases["growfactor"] = cases.newcases / cases.groupby("country")["newcases"].shift(1).fillna(0)
# Keep only the rows where a country has reached at least 20 cumulative cases.
# .copy() detaches the result from the unfiltered frame so the column
# assignments below do not trigger SettingWithCopyWarning.
cases = cases.loc[cases.cumcases >= 20, :].copy()
# Earliest remaining date of each country, repeated on every row of that
# country. transform() returns a result aligned row by row, so no implicit
# re-alignment against the non-unique country index is needed.
cases["datemin"] = cases.groupby("country")["date"].transform("min")
# Whole days elapsed since that first date (0 on the first retained day).
cases["days"] = (cases["date"] - cases["datemin"]).dt.days
def exponential_growth(x, a, c_o):
    """Evaluate the exponential model ``c_o * a**x``.

    x: exponent; a scalar or anything that supports ``a ** x`` element-wise.
    a: base of the exponential (the growth factor per unit of ``x``).
    c_o: multiplicative constant, i.e. the model value at ``x == 0``.

    Returns ``c_o * a**x``.
    """
    return c_o * a ** x
def country_fit(df, country):
    """Fit ``exponential_growth`` to the cumulative cases of one country.

    df: frame indexed by country that has ``days`` and ``cumcases`` columns.
    country: index label of the country to fit.

    Returns the optimal parameters found by ``curve_fit``, in the order of
    the model's arguments after ``x``: ``(a, c_o)``.
    """
    country_rows = df.loc[country]
    # The covariance estimate that curve_fit also returns is not used.
    popt, _ = curve_fit(exponential_growth, country_rows.days, country_rows.cumcases)
    return popt
# country_list= ["Italy", "Germany", "Spain", "South_Korea", "United_States_of_America", "Singapore"]
# popt_list keeps the full parameter vector of each fit, in the same order as
# country_list; popt_df keeps only the first parameter (the base "a" of the
# exponential) in column "A", one row per country.
popt_list = []
popt_df = pd.DataFrame(columns=["A"])
for country in country_list:
    popt = country_fit(cases, country)
    popt_list.append(popt)
    popt_df.loc[country] = popt[0]
# The fitted parameter A can be interpreted as the average daily growth factor of the total number of cases.
# Growth factors ordered from largest to smallest. The result is not assigned,
# so outside an interactive session this expression has no effect.
popt_df.sort_values("A", ascending= False)
fig = go.Figure()
# popt_list was filled in country_list order, so the two can be paired directly.
for country, popt in zip(country_list, popt_list):
    country_cases = cases.loc[country]
    # Observed cumulative cases against days since the first retained date.
    fig.add_trace(
        go.Scatter(
            x=country_cases.days,
            y=country_cases.cumcases,
            mode="markers",
            name=country,
        )
    )
    # Fitted curve evaluated on the same days.
    fig.add_trace(
        go.Scatter(
            x=country_cases.days,
            y=exponential_growth(country_cases.days, *popt),
            mode="lines",
            name="exponential fit",
        )
    )
fig.update_layout(title="Exponential fit per country")
pio.show(fig)
# A positive change means an increase in the number of newly reported cases. A negative change means a decrease in the number of new cases, which is good news!
# Countries with at least 1000 total cases. (A hand-written list used to be
# assigned here and was overwritten right away.)
country_list = cases_total.loc[cases_total.total_cases >= 1000, :].index
# Change in new cases from one row to the next within each country. The first
# row of a country has no predecessor, so its shifted value is filled with 0
# and its growspeed equals its own newcases.
cases["growspeed"] = cases.newcases - cases.groupby("country").newcases.shift(1).fillna(0)
fig = go.Figure()
for country in country_list:
    country_cases = cases.loc[country]
    fig.add_trace(
        go.Scatter(
            x=country_cases.date,
            y=country_cases.growspeed,
            mode="markers",
            name=country,
        )
    )
fig.update_layout(title="Daily change in number of new cases")
pio.show(fig)
#### Grow speed with different aggregations
# Index by date: resample() needs a datetime-like index. "country" becomes a
# regular column again and is used as the grouping key.
cases = cases.reset_index().set_index("date")


def _newcases_per_period(frame, days):
    """Return per-country sums of ``newcases`` over consecutive ``days``-day bins.

    frame: date-indexed frame with ``country`` and ``newcases`` columns.
    days: width of each bin in days.

    Returns a one-column frame (``newcases``) indexed by (country, bin start).
    """
    # Only newcases is aggregated. Summing the whole frame would also process
    # the datetime and text columns, which is wasted work and raises TypeError
    # on pandas >= 2.0.
    return (
        frame.groupby("country")["newcases"]
        .resample(pd.Timedelta(days=days))
        .sum()
        .to_frame()
    )


cases_2d = _newcases_per_period(cases, 2)
cases_3d = _newcases_per_period(cases, 3)
cases_4d = _newcases_per_period(cases, 4)
cases_7d = _newcases_per_period(cases, 7)
# Grow speed
def _plot_period_growspeed(period_cases, title):
    """Add a ``growspeed`` column to ``period_cases`` and plot it per country.

    period_cases: frame indexed by (country, period start) with a
        ``newcases`` column; it is modified in place.
    title: title of the figure.

    Returns the figure after showing it.
    """
    # Change against the previous period of the same country; a country's
    # first period has no predecessor, so its shifted value is filled with 0.
    previous = period_cases.groupby("country").newcases.shift(1).fillna(0)
    period_cases["growspeed"] = period_cases.newcases - previous
    figure = go.Figure()
    for country in country_list:
        # Selecting on the first index level leaves the period start as index.
        per_country = period_cases.loc[country]
        figure.add_trace(
            go.Scatter(
                x=per_country.index,
                y=per_country.growspeed,
                mode="markers",
                name=country,
            )
        )
    figure.update_layout(title=title)
    pio.show(figure)
    return figure


# Same figure for each aggregation width; fig ends up holding the last one.
fig = _plot_period_growspeed(cases_2d, "Change in the number of new cases (2 days period)")
fig = _plot_period_growspeed(cases_3d, "Change in the number of new cases (3 days period)")
fig = _plot_period_growspeed(cases_4d, "Change in the number of new cases (4 days period)")
fig = _plot_period_growspeed(cases_7d, "Change in the number of new cases (1 week period)")
# Put the frame back on its country index, the layout that the per-country
# cases.loc[<country>] lookups rely on.
cases = cases.reset_index().set_index("country")
#!jupyter nbconvert --to html --template toc2 EAP.ipynb